set.seed(1)
required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander", "wordcloud2")
for(i in required_packages) {
if(!require(i, character.only = T)) {
# if package is not existing, install then load the package
install.packages(i, dependencies = T)
require(i, character.only = T)
}
}
panderOptions('table.alignment.default', "left")
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 20,
face = "bold",
lineheight = 1.05,
hjust = .5,
margin = margin(10, 0, 25, 0)),
plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(family = "Montserrat",
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(family = "Montserrat",
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for maps
theme_map <-
theme_void(base_family = "Montserrat") +
theme(legend.direction = "horizontal",
legend.box = "horizontal",
legend.margin = margin(10, 10, 10, 10),
legend.title = element_text(size = 17,
face = "bold"),
legend.text = element_text(color = "grey33",
size = 12),
plot.margin = margin(15, 5, 15, 5),
plot.title = element_text(face = "bold",
size = 20,
hjust = .5,
margin = margin(30, 0, 10, 0)),
plot.subtitle = element_text(face = "bold",
color = "grey33",
size = 17,
hjust = .5,
margin = margin(10, 0, -30, 0)),
plot.caption = element_text(size = 14,
color = "grey33",
hjust = .97,
margin = margin(-30, 0, 0, 0)))
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)display_number <- function(n, title){
tibble(!!sym(title) := format(n, big.mark = ",")) %>% pander()
}con <- dbConnect(
bigrquery::bigquery(),
project = "dataforseo-bigquery",
billing = "dataforseo-bigquery"
)
sql <- glue("SELECT * FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
ORDER BY keyword_info_search_volume DESC
LIMIT 50000")
tb <- bq_project_query("dataforseo-bigquery", sql)
top <- bq_table_download(tb, max_results = 50000)!!!J: In this version I remove all the entries with zero volume for all stats. I think if we prefer to go by count rather than by volume, this is the best approach.
!!!D: Sounds good. I trust you on this. Please make sure to add a note to the methodology section when moving the files.
SELECT COUNT(keyword_info_search_volume) as total_count
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume > 0total_count <- sql$total_count
tibble("Total number of searches" = glue("~{format(round(total_count / 1000000))} million")) %>%
pander()| Total number of searches |
|---|
| ~306 million |
SELECT SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS total_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840# Calculated in a roundabout way to avoid integer overflow
total_volume <- sql$total_volume * 10000
tibble("Total volume of searches" = glue("~{format(round(total_volume / 1000000000))} billion")) %>%
pander()| Total volume of searches |
|---|
| ~303 billion |
This table shows the top 10 searches. They are all spelling errors. As in, they are not really searched, but rather people attempting to go to Youtube or Facebook, but typing it wrong. Oddly they are all attributed as having a search volume of exactly 185 million.
tbl <- top %>%
select(keyword, location, spell, spell_type, keyword_info_search_volume) %>%
head(10)
tbl %>% write_csv("../plots/csv/top_misspell.csv")
tbl %>% gt() %>%
tab_options(table.align = "left") %>%
tab_header("Top searches")| Top searches | ||||
|---|---|---|---|---|
| keyword | location | spell | spell_type | keyword_info_search_volume |
| jou tube | 2840 | youtube | showing_results_for | 185000000 |
| youtube the | 2840 | 185000000 | ||
| youi tue | 2840 | youtube | showing_results_for | 185000000 |
| acerook | 2840 | 185000000 | ||
| youetube | 2840 | 185000000 | ||
| you tbut | 2840 | youtube | did_you_mean | 185000000 |
| ykutube | 2840 | youtube | showing_results_for | 185000000 |
| uotod | 2840 | youtube | showing_results_for | 185000000 |
| utuen | 2840 | youtube | did_you_mean | 185000000 |
| ytu tube | 2840 | 185000000 | ||
SELECT COUNT(*) as missing_count
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULLmissing_count <- sql$missing_count
tibble("Missing search volume" = scales::percent(missing_count / total_count, accuracy = 0.001)) %>% pander()| Missing search volume |
|---|
| 0.975% |
The missing have some searches that are likely high volume. Thus they are truly missing, and not just 0s.
SELECT keyword, keyword_info_search_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULL
ORDER BY RAND()
LIMIT 10sql %>% gt() %>% tab_options(table.align = "left") %>%
tab_header("Keywords with missing search volume")| Keywords with missing search volume | |
|---|---|
| keyword | keyword_info_search_volume |
| K periodic table | NA |
| Black Lagoon: Roberta Voice Actor | NA |
| Town population | NA |
| Flemington, NJ steakhouse | NA |
| Stretches for lower leg pain | NA |
| RAID level 2 | NA |
| Ribbed Cotton Bralette | NA |
| St Louis County Municipal Court | NA |
| La Jolla movie theater | NA |
| The Little broomstick | NA |
| Mean search volume |
|---|
| 988.6631 |
SELECT approx_quantiles(keyword_info_search_volume, 2)[offset(1)] AS median
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0| Median search volume |
|---|
| 10 |
SELECT AVG(`keyword_info_cpc`) AS mean_cpc
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume > 0| Mean CPC |
|---|
| 0.6069549 |
SELECT approx_quantiles(keyword_info_cpc, 2)[offset(1)] AS median_cpc
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0| Median CPC |
|---|
| 0 |
SELECT spell_type, SUM(keyword_info_search_volume) / 10000 AS `volume`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
GROUP BY spell_typespell_types <- sql %>%
mutate(spell_type = case_when(
spell_type == "" ~ "No spell type",
spell_type == "showing_results_for" ~ "Showing results for",
spell_type == "did_you_mean" ~ "Did you mean",
spell_type == "no_results_found_for" ~ "No results found for")) %>%
mutate(volume = volume / sum(volume))
spell_types %>% ggplot(aes(x = reorder(spell_type, volume), y = volume)) +
geom_bar(stat = "identity", width = 0.8, fill = "turquoise4", color = "black") +
labs(x = "", y = "", title = "Spell types - by volume") +
scale_y_continuous(labels = scales::percent)About half of search volume has a spell type. This is especially driven by misspellings of common domains.
If going by count instead of by volume, almost none of the searches have a spell type:
SELECT spell_type, COUNT(spell_type) as n
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
ANd keyword_info_search_volume > 0
GROUP BY spell_typeprop <- 1 - sql %>% filter(spell_type == "") %>% pull(n) / total_count
tibble("Proportion with spell type" = scales::percent(prop, accuracy = 0.001)) %>% pander()| Proportion with spell type |
|---|
| 1.389% |
spell_types <- sql %>%
filter(spell_type != "") %>%
mutate(spell_type = case_when(
spell_type == "" ~ "No spell type",
spell_type == "showing_results_for" ~ "Showing results for",
spell_type == "did_you_mean" ~ "Did you mean",
spell_type == "no_results_found_for" ~ "No results found for"))
spell_types %>% mutate(prop = n / sum(n)) %>%
ggplot(aes(x = reorder(spell_type, prop), y = prop)) +
geom_bar(stat = "identity", width = 0.8, fill = "turquoise4", color = "black") +
labs(x = "", y = "", title = "Spell types - by count") +
scale_y_continuous(labels = scales::percent)!!!D: without spell type is missing “No spell type”
!!!J: This is on purpose, since only 1.38% has a spell type, as shown just above.
tbl <- top %>% group_by(spell) %>%
summarise(volume = sum(keyword_info_search_volume)) %>%
arrange(desc(volume)) %>%
filter(spell != "") %>%
mutate(volume = scales::percent(volume / sum(volume), accuracy = 0.1)) %>%
head(10)#
tbl %>% write_csv("../plots/csv/misspell.csv")
tbl %>%
gt() %>%
tab_options(table.align = "left") %>%
tab_header("Top 10 intended searches that are misspelled")| Top 10 intended searches that are misspelled | |
|---|---|
| spell | volume |
| youtube | 35.3% |
| 8.7% | |
| amazon | 7.6% |
| 6.3% | |
| weather | 2.2% |
| translate | 1.6% |
| com | 1.5% |
| 1.3% | |
| walmart | 1.3% |
| ebay | 1.2% |
question_words <- c("what", "which", "where", "who", "why", "how")
write_questions_volume <- function(){
questions <- tribble(~question, ~volume)
for (word in question_words){
sql <- glue("SELECT sum(keyword_info_search_volume) FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword like '% {word} %' OR keyword like '{word} %' OR keyword like '% {word}'")
tb <- bq_project_query("dataforseo-bigquery", sql)
df <- bq_table_download(tb)
questions %<>% add_row(question = word, volume = df$f0_)
}
write_csv(questions, "../proc_data/questions_volume.csv")
}
#write_questions_volume()
questions <- read_csv("../proc_data/questions_volume.csv")
questions %>% mutate(prop = volume / total_volume) %>%
ggplot(aes(x = reorder(question, prop), y = prop)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
scale_y_continuous(labels = scales::percent, limits = c(0, 0.006), expand = c(0,0)) +
labs(title = "Questions in searches - by volume", x = "", y = "")write_questions_count <- function(){
questions <- tribble(~question, ~n)
for (word in question_words){
sql <- glue("SELECT COUNT(keyword_info_search_volume) as n
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
AND keyword like '% {word} %' OR keyword like '{word} %' OR keyword like '% {word}'")
tb <- bq_project_query("dataforseo-bigquery", sql)
df <- bq_table_download(tb)
questions %<>% add_row(question = word, n = df$n)
}
write_csv(questions, "../proc_data/questions_count.csv")
}
#write_questions_count()
questions <- read_csv("../proc_data/questions_count.csv")
questions %>% mutate(prop = n / total_count) %>%
ggplot(aes(x = reorder(question, prop), y = prop)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
scale_y_continuous(labels = scales::percent, limits = c(0, 0.1), expand = c(0,0)) +
labs(title = "Questions in searches - by count", x = "", y = "")tibble("Total percentage of searches that are questions" =
scales::percent(questions %$% sum(n) / total_count, accuracy = 0.001)) %>%
pander()| Total percentage of searches that are questions |
|---|
| 14.085% |
stopword_list <- tibble(stopword = stopwords::stopwords(language = "en")) %>%
mutate(stopword = str_remove(stopword, "'")) %>%
filter(!(stopword %in% c("shed", "wed", "ill", "hell", "shell")))
get_stopwords_counts <- function(){
stopwords <- tribble(~stopword, ~n)
for (word in stopword_list$stopword){
print(word)
sql <- glue(
"SELECT COUNT(keyword_info_search_volume) AS stopword_count
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
AND keyword like '% {word} %' OR keyword like '{word} %' OR keyword like '% {word}'")
tb <- bq_project_query("dataforseo-bigquery", sql)
df <- bq_table_download(tb)
stopwords %<>% add_row(stopword = word, n = df$stopword_count)
}
write_csv(stopwords, "../proc_data/stopwords.csv")
}
#get_stopwords_counts()
stopwords <- read_csv("../proc_data/stopwords.csv")
stopwords %>% mutate(prop = n / total_count) %>%
arrange(desc(prop)) %>%
head(10) %>%
ggplot(aes(x = reorder(stopword, prop), y = prop)) +
geom_bar(stat = "identity", color = "black", fill = "turquoise4", width = 0.7) +
scale_y_continuous(labels = scales::percent, expand = c(0,0), limits = c(0, 0.085)) +
labs(x = "", y = "", title = "Searches with specific stopwords") +
coord_flip()!!!D: the graph is not that insightful. Please make the wordcloud a bit more readable and visually appealing. Feel to add more than 25 words.
volume_top <- top %>%
add_rownames() %>%
mutate(rowname = as.numeric(rowname)) %>%
select(rowname, volume = keyword_info_search_volume)
ylab <- c(50, 100, 150, 200)
volume_top %>%
filter(rowname < 10000) %>%
mutate(cat = case_when(
rowname < 500 ~ "Top 500",
rowname < 2000 ~ "Top 2000",
rowname < 10000 ~ "Top 10000"
)) %>%
mutate(cat = factor(cat, levels = c("Top 500", "Top 2000", "Top 10000"))) %>%
head(10500) %>%
ggplot(aes(x = rowname, y = volume, fill = cat)) +
geom_area(alpha = 0.8) +
scale_y_continuous(
labels = glue("{ylab} M"),
breaks = 10^6 * ylab,
limits = c(0, 200* 10^6),
expand = c(0,0)
) +
labs(x = "", title = "Volume of searches", fill = "") +
annotate("text", x = 7800, y = 75*10^6, label = "Remaining 99.9967%") +
geom_segment(aes(x = 7900, y = 35*10^6, xend = 10000, yend = 35*10^6),
arrow = arrow(length = unit(0.5, "cm"))) +
scale_x_continuous(expand = c(0,0), limits = c(0, 10000)) +
ggeasy::easy_remove_axes(which = "x", what = "text")get_count_range <- function(lower, higher)
{
sql <- glue(
"SELECT COUNT(*) AS `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume >= {lower}
AND keyword_info_search_volume <= {higher}")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb)$count
}
df <- tribble(
~cat, ~count,
"0 - 10", get_count_range(0, 10),
"11- 100", get_count_range(11, 100),
"101 - 1000", get_count_range(101, 1000),
"1001 - 10000", get_count_range(1001, 10000),
"10001 - 100000", get_count_range(10001, "100000"),
"100001+", get_count_range("100001", "100000000000"))
df %>%
mutate(count = count / sum(count)) %>%
ggplot(aes(x = reorder(cat, desc(count)), y = count)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
theme_light() +
labs(x = "Volume", y = "Percentage of all searches", title = "Volume of searches") +
scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0,0))write_length_volume <- function()
{
get_length_volume <- function(l)
{
sql <- glue(
"SELECT sum(keyword_info_search_volume) / 10000 as `volume`, count(keyword_info_search_volume) as `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume IS NOT NULL
AND LENGTH(keyword) = {l}")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(length = l, volume = volume * 10000)
}
df <- map_df(1:50, get_length_volume)
write_csv(df, "../proc_data/keyword_length_volume.csv")
}df <- read_csv("../proc_data/keyword_length_volume.csv")
df %>% mutate(prop = volume / sum(volume)) %>%
ggplot(aes(x = length, y = prop)) +
geom_line(color = "turquoise4", size = 1) + geom_point(color = "turquoise4") +
theme_light() +
labs(x = "Keyword length", y = "Total search volume", title = "Search volume by keyword length") +
scale_y_continuous(labels = scales::percent) The most searched queries have length 6-9 characters, and falls continuously for search queries longer or shorter than that.
pservices <- read_csv("../raw_data/productsservices.csv") %>%
clean_names() %>% rename(c1 = criterion_id) %>% select(-category) %>%
separate(c1, sep =",\"", into = c("id", "category")) %>%
mutate(category = substr(category, 2, nchar(category) -1)) %>%
separate(category, sep = "/", into = c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8"))
toplevel <- pservices %>% filter(is.na(cat2))write_categories <- function()
{
get_category_volume <- function(id){
sql <- glue(
"SELECT SUM(keyword_info_search_volume) / 10000 AS `search_volume`, AVG(keyword_info_cpc) AS `cpc`, COUNT(*) AS `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_categories like '%{id}%' ")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(id = id)
}
df <- map_df(toplevel$id, get_category_volume)
df %>% mutate(search_volume = search_volume * 10000,
mean_volume = search_volume / count
) %>%
left_join(toplevel %>% select(id, cat1), by = "id") %>%
write_csv("../proc_data/categories_averages.csv")
}
df <- read_csv("../proc_data/categories_averages.csv")df %>%
ggplot(aes(x = fct_rev(cat1), y = mean_volume)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
theme_light() +
coord_flip() +
scale_y_continuous(limits = c(0, 9000), expand = c(0,0)) +
labs(x = "", y = "", title = "Search volume mean by category")!!!D: I would be curious to see the same graph with median. Wouldn´t it be better to use median given the skewed data set?
!!!J: I am skeptical for using the median, since it is brought far down by a large number of searches with low volume, even when we exclude 0 volume searches. Remember, the median cpc overall was 0. But you are right that it is an issue with the skewed data set.
!!!D: Maybe a boxplot or something similiar would make more sense here? Or something like that: https://tinyurl.com/yxfwo5vt Leave that out if it´s too complicated to implement.
write_categories_volume <- function()
{
get_category_volume <- function(id){
sql <- glue(
"SELECT SUM(keyword_info_search_volume) / 10000 as volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
AND keyword_info_categories like '%{id}%' ")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(id = id)
}
df <- map_df(toplevel$id, get_category_volume) %>%
mutate(volume = volume * 10000)
df %>% left_join(toplevel %>% select(id, cat1), by = "id") %>%
write_csv("../proc_data/categories_total_volume.csv")
}
write_categories_volume()
df <- read_csv("../proc_data/categories_total_volume.csv")df %>%
mutate(volume = volume / sum(volume)) %>%
ggplot(aes(x = fct_rev(cat1), y = volume)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
theme_light() +
coord_flip() +
scale_y_continuous(limits = c(0, 0.25), expand = c(0,0), labels = scales::percent) +
labs(x = "", y = "", title = "Total search volume by category")write_categories <- function()
{
get_category <- function(id){
sql <- glue(
"SELECT AVG(keyword_info_cpc) as mean_cpc
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
AND keyword_info_categories like '%{id}%' ")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(id = id)
}
df <- map_df(toplevel$id, get_category)
df %>% left_join(toplevel %>% select(id, cat1), by = "id") %>%
write_csv("../proc_data/categories_cpc.csv")
}
write_categories()
df <- read_csv("../proc_data/categories_cpc.csv")df %>%
ggplot(aes(x = fct_rev(cat1), y = mean_cpc)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
theme_light() +
coord_flip() +
scale_y_continuous(limits = c(0, 2), expand = c(0,0)) +
labs(x = "", y = "", title = "Mean CPC by category")transactional_words <- c("apply", "buy", "coupons", "clearance", "deals", "discount",
"download", "for sale", "order", "purchase", "reserve",
"schedule appointment", "special")
informational_words <- c("how do", "how does", "how can i", "what is", "what are", "ways to", "guide",
"how to", "tutorial", "best", "cheap", "alternatives", "compare", "improve")
navigational_words <- c("location of", "near me", "features of", "cost of", "hours of", "directions to",
"reviews", "free shipping", "prices", "testimonials")
get_keyword_counts <- function(type, wordlist){
words <- tribble(~keyword, ~n)
for (word in wordlist){
print(word)
sql <- glue(
"SELECT COUNT(keyword_info_search_volume) AS count
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
AND keyword like '% {word} %' OR keyword like '{word} %' OR keyword like '% {word}'")
tb <- bq_project_query("dataforseo-bigquery", sql)
df <- bq_table_download(tb)
words %<>% add_row(keyword = word, n = df$count)
}
write_csv(words %>% mutate(type = type), glue("../proc_data/{type}_words.csv"))
}
#get_keyword_counts("transactional", transactional_words)
#get_keyword_counts("informational", informational_words)
#get_keyword_counts("navigational", navigational_words)
df <- bind_rows(
read_csv("../proc_data/transactional_words.csv"),
read_csv("../proc_data/informational_words.csv"),
read_csv("../proc_data/navigational_words.csv")
)
df %>% group_by(type) %>%
summarise(prop = sum(n) / total_count) %>%
mutate(type = str_to_title(type)) %>%
ggplot(aes(x = type, y = prop)) +
geom_bar(stat = "identity", width = 0.8, fill = "turquoise4", color = "black") +
scale_y_continuous(labels = scales::percent, limits = c(0, 0.13), expand = c(0,0)) +
labs("Keyword type", y = "", x = "")!!!D: I would guess most are navigational queries. May be we can add a separate category “navigational - brands”? see a list below. What do you think?
Also, please not what the % is for keywords that don´t fit into any of the cat. and list the keywords in your descriptions for all cat.
navigational_words_brands <- c( Amazon Google Apple Microsoft Samsung Group ICBC Facebook Walmart Ping An Huawei Mercedes-Benz Verizon China Construction Bank AT&T Toyota State Grid Disney Agricultural Bank Of China WeChat Bank of China Home Depot China Mobile Shell Saudi Aramco Volkswagen YouTube Tencent Starbucks Wells Fargo BMW Deutsche Telekom Moutai PetroChina Coca-Cola Mitsubishi Group McDonald’s Taobao NTT Group Bank of America Nike Porsche Sinopec IBM CITI Honda Marlboro Deloitte Chase Tmall UPS American Express Xfinity UnitedHealthcare Sumitomo Group Intel VISA Instagram China Life accenture Allianz CSCEC PWC Lowe’s Mitsui General Electric EY Oracle Cisco BP CVS Total FedEx Netflix China Merchants Bank JP Morgan Boeing Costco SK Group Wuliangye Evergrande Nestle Hyundai Group China Telecom Siemens TATA Group Mastercard Bosch IKEA HSBC Spectrum Vodafone Pepsi Alibaba.com Ford AIA Orange Nissan Chevron GUCCI Dell Technologies